SF Fire Data (Incidents, Violations, and more)
Messing around with plotly visualization and ipywidgets
import os
import pandas as pd
import numpy as np
import re
import ipywidgets as widgets
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
%matplotlib inline
# Notebook-wide display and plotting defaults.
pd.options.display.max_columns = 40
sns.set(style='darkgrid', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8

# Load the raw calls-for-service export.
csv_path = '../../../data/sf-fires-data/fire-department-calls-for-service.csv'
data = pd.read_csv(csv_path, low_memory=False)

# Normalise the headers: lower case, spaces replaced by underscores.
data.columns = [name.lower().replace(' ', '_') for name in data.columns]

# Every column whose name contains 'date' or 'dttm' is parsed to datetime;
# the name is printed first so the converted columns are visible in the output.
timestamp_columns = [name for name in data.columns
                     if 'date' in name or 'dttm' in name]
for name in timestamp_columns:
    print(name)
    data[name] = pd.to_datetime(data[name])

data.shape
data.head()

# Per-column dtype next to the number of missing values.
column_dtypes = data.dtypes.values
missing_counts = data.isnull().sum().values
pd.DataFrame(np.transpose([column_dtypes, missing_counts]),
             index=data.columns, columns=['dtype', 'isnull'])
data.describe(include='all')
# One row per distinct (call number, call type) pair.
call_types = data[['call_number', 'call_type']].drop_duplicates()

# If each call number maps to exactly ONE call type, the largest counts
# shown here are all 1.
call_types.call_number.value_counts().head(5)

type_counts = call_types.call_type.value_counts()
type_counts

# Bar chart of calls per type; the y axis is logarithmic.
ax = type_counts.plot(kind='bar')
ax.set_title('Distribution of Call Types')
ax.set_ylabel('Number of Calls')
ax.set_xlabel('Call Type')
ax.set_yscale('log')
# One row per distinct (call number, call date) pair. The explicit copy makes
# the column assignment below operate on an independent frame.
call_dates = data[['call_number', 'call_date']].drop_duplicates().copy()
# Sanity check: the largest counts are 1 when every call number has one date.
call_dates.call_number.value_counts()[0:5]

month_map = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December',
}
call_dates['month'] = call_dates.call_date.dt.month.map(month_map)

# Calls per month name, in calendar order.
# - reindex (instead of .loc) keeps a month that never occurs as a 0 count
#   rather than raising KeyError.
# - to_frame(name=...) pins the column name, which otherwise depends on the
#   pandas version (value_counts names its result 'count' from pandas 2.0).
month_order = [month_map[number] for number in range(1, 13)]
fig1 = (
    call_dates.month.value_counts()
    .reindex(month_order, fill_value=0)
    .to_frame(name='month')
)

plt.plot(fig1.index, fig1['month'])
plt.title('Distribution of Incidents Throughout the Year')
plt.ylabel('Number of Calls')
plt.xlabel('Month')
plt.xticks(rotation=90)
plt.show()
# calls per month, year
# Each call date is truncated to the first day of its month, then calls are
# counted per month. This replaces parsing the string form of a (year, month)
# index tuple, which relied on how pandas stringifies non-string values.
month_start = call_dates.call_date.dt.to_period('M').dt.to_timestamp()
calls_per_month = (
    call_dates.groupby(month_start.rename('month'))
    .size()
    .rename('num_calls')
    .reset_index()
    [['num_calls', 'month']]  # same column order as before
)
calls_per_month.head()
# import plotly
# NOTE(review): `plotly.plotly` moved to the separate chart-studio package in
# plotly 4, and `py` is not used in the code visible here - confirm before
# upgrading plotly.
import plotly.plotly as py
import plotly.graph_objs as go
# these two lines are what allow your code to show up in a notebook!
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
# Mapbox token used by the map figure. It can be supplied through the
# MAPBOX_ACCESS_TOKEN environment variable; the literal is only a fallback so
# existing runs keep working.
# NOTE(review): a credential committed to source should be rotated and the
# fallback removed once the environment variable is in place.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoidGtoNTA0NCIsImEiOiJjanB1MnppNHowY2h4NDhxem1wYzQ0YnNvIn0.nVJ3ydUD6wePxdrWL8dvqA',
)
# Line trace with the month on the x axis and the number of calls on the y axis.
monthly_trace = go.Scatter(x=calls_per_month.month,
                           y=calls_per_month.num_calls)
fig_data = [monthly_trace]

# Figure layout: title plus x-axis styling.
x_axis = {'title': 'Date', 'ticklen': 5, 'zeroline': False}
layout = go.Layout(title="Number of Calls per Month", xaxis=x_axis)

# Assemble the figure and render it in the notebook.
fig = {'data': fig_data, 'layout': layout}
iplot(fig)
data.head()

# Location-related columns, de-duplicated.
location_columns = ['call_number', 'call_type', 'call_date', 'address',
                    'zipcode_of_incident', 'location']
call_loc = data[location_columns].drop_duplicates()
call_loc.shape
call_loc.dtypes
call_loc.isnull().sum()

# Hover text for the map. A missing zip code is filled with 0 so the column
# can be cast to int, then the resulting '0' is blanked out again.
date_text = call_loc.call_date.astype(str)
address_text = call_loc.address.astype(str)
zip_text = (call_loc.zipcode_of_incident
            .fillna(0)
            .astype(int)
            .astype(str)
            .replace('0', ''))
call_loc['text'] = ('Date: ' + date_text
                    + ', Address: ' + address_text
                    + ', Zip: ' + zip_text)
call_loc.head()
# Unsigned number, with or without a fractional part. Compiled once because it
# is applied to every row.
_COORD_RE = re.compile(r'\d+(?:\.\d+)?')


def _coord_after(tokens, key, key_pos):
    """Return the unsigned number in the token following ``tokens[key_pos]``.

    NaN is returned when the token list is too short, when ``key`` does not
    occur in ``tokens[key_pos]``, or when the following token has no number.
    """
    if len(tokens) <= key_pos + 1 or key not in tokens[key_pos]:
        return np.nan
    match = _COORD_RE.search(tokens[key_pos + 1])
    return float(match.group()) if match else np.nan


# The location text is split on whitespace; the latitude key is expected in
# token 2 (value in token 3) and the longitude key in token 4 (value in
# token 5). Missing or malformed locations produce NaN instead of raising.
lat = []
lon = []
for loc in call_loc.location:
    tokens = loc.split() if isinstance(loc, str) else []
    lat.append(_coord_after(tokens, 'latitude', 2))
    lon.append(_coord_after(tokens, 'longitude', 4))

call_loc['latitude'] = lat
call_loc['longitude'] = lon
call_loc.isnull().sum()
call_loc.dtypes

# The pattern captures the magnitude only, so the sign is applied here:
# latitude stays positive and longitude is negated.
# NOTE(review): this assumes every incident lies at a northern latitude and a
# western longitude (the map below is centred on lon=-122.42) - confirm.
call_loc['latitude'] = call_loc['latitude'].astype(float)
call_loc['longitude'] = call_loc['longitude'].astype(float) * -1
# Range and mean of the parsed coordinates. The Series reductions skip NaN
# (rows whose location could not be parsed); the builtin min/max over a list
# give order-dependent results when NaN is present, and np.mean returns NaN.
print('Latitude:')
print('min: ', call_loc.latitude.min())
print('max: ', call_loc.latitude.max())
print('mean:', call_loc.latitude.mean())
print()
print('Longitude:')
print('min: ', call_loc.longitude.min())
print('mean:', call_loc.longitude.mean())
print('max: ', call_loc.longitude.max())
# plot
# There are a lot of data points, so the map is filtered on year and call type.
call_years = call_loc.call_date.dt.year

# Year slider spanning the years present in the data.
slider_options = {
    'min': min(call_years),
    'max': max(call_years),
    'step': 1,
    'description': 'Year:',
    'disabled': False,
    'continuous_update': False,
    'orientation': 'horizontal',
    'readout': True,
    'readout_format': 'd',
}
yr = widgets.IntSlider(**slider_options)

# Drop-down listing every call type present in the data.
available_call_types = call_loc.call_type.unique().tolist()
ct = widgets.Dropdown(
    options=available_call_types,
    description='Call Type:',
    disabled=False,
)

print('Choose parameters to visualize:')
for control in (yr, ct):
    display(control)

# Rows matching the values the widgets hold at the time this runs.
matches_selection = (call_years == yr.value) & (call_loc.call_type == ct.value)
print('Number of data points to plot based on parameters:',
      len(call_loc[matches_selection]))
# Rows for the year and call type currently selected in the widgets.
in_selected_year = call_loc.call_date.dt.year == yr.value
is_selected_type = call_loc.call_type == ct.value
loc_data = call_loc[in_selected_year & is_selected_type]

# One marker per row, with the prepared hover text.
incident_markers = go.Scattermapbox(
    lat=loc_data.latitude,
    lon=loc_data.longitude,
    mode='markers',
    marker={'size': 9},
    text=loc_data.text,
)
fig_data = [incident_markers]

# Map view: fixed centre, zoom level 10, no bearing or pitch.
map_view = {
    'accesstoken': mapbox_access_token,
    'bearing': 0,
    'center': {'lat': 37.75, 'lon': -122.42},
    'pitch': 0,
    'zoom': 10,
}
layout = go.Layout(autosize=True, hovermode='closest', mapbox=map_view)

# Assemble the figure and render it in the notebook.
fig = {'data': fig_data, 'layout': layout}
iplot(fig)
data.head()
# Number of rows recorded for each call number. The column name is set
# explicitly because value_counts() names its result 'count' from pandas 2.0
# on, which would make the lookup below fail with KeyError.
call_num = data.call_number.value_counts().to_frame(name='call_number')
# Call numbers that appear in exactly 10 rows.
call_num[call_num['call_number'] == 10]
# All rows for one call number, for inspection.
data[data.call_number == 100650261]
# priority - Code 2: Non-Emergency or Code 3: Emergency
data.final_priority.value_counts()

# Time between dispatch and response for every row.
data['response_time'] = data.response_dttm - data.dispatch_dttm
data.head()

# average response times for each unit
data.unit_id.nunique()

# Keep rows that have a response time. The explicit copy makes the conversion
# below operate on an independent frame rather than a slice of `data`.
resp_times = data.loc[data.response_time.notnull(),
                      ['unit_id', 'response_time']].copy()
# total_seconds() is the full duration. `.dt.seconds` is only the seconds
# component, so it drops whole days and wraps negative durations.
resp_times['response_time'] = resp_times['response_time'].dt.total_seconds()
resp_times.groupby('unit_id').mean().sort_values('response_time')